import pandas as pd
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy
import json
import re
from bokeh.charts import Scatter, Bar, output_notebook, show
from bokeh.plotting import *
try:
# For Python 3.0 and later
from urllib.request import urlopen
except ImportError:
# Fall back to Python 2's urllib2
from urllib2 import urlopen
# from mpl_toolkits.basemap import Basemap
# Read the dataset URLs, one per line. Later code indexes this list
# positionally (urls[0] .. urls[3]), so the line order in the file matters.
# The file is only read, so open it read-only: 'r+' would needlessly require
# write permission and fail on a read-only file.
with open('data_links.txt', 'r') as f:
    urls = [x.strip('\n') for x in f]
def inUS(city):
    """Return True if *city* is one of the six cities kept for the analysis.

    The check is an exact, case-sensitive comparison against a fixed list of
    city names; any other value (including None) yields False.
    """
    return city in (
        'Phoenix',
        'Pittsburgh',
        'Charlotte',
        'Urbana-Champaign',
        'Las Vegas',
        'Madison',
    )
def replaceCities(city):
    """Return *city* unchanged if it is a recognised city, else "nothing".

    Only four city names are recognised here; every other value maps to the
    string "nothing".
    """
    # NOTE(review): 'Las Vegas' and 'Madison' are accepted by inUS() but are
    # not listed here, so they map to "nothing" - confirm this is intended.
    recognised = ('Phoenix', 'Pittsburgh', 'Charlotte', 'Urbana-Champaign')
    identity_map = dict(zip(recognised, recognised))
    return identity_map.get(city, "nothing")
def isPrefferedCusine(cuisine):
    """Return True if *cuisine* describes a restaurant of a preferred cuisine.

    *cuisine* is tested with ``in``: it must contain 'Restaurants' and at
    least one of the preferred cuisine names. For a list this is an exact
    element match; for a string it is a substring match.

    Missing values (None, or the float NaN pandas uses for missing data) do
    not support ``in`` and are reported as not preferred instead of raising
    TypeError.
    """
    if cuisine is None or isinstance(cuisine, float):
        return False
    if 'Restaurants' not in cuisine:
        return False
    # Generator form stops at the first matching cuisine name.
    return any(
        name in cuisine
        for name in ('American', 'Indian', 'Chinese', 'Japanese',
                     'Middle Eastern', 'Mexican')
    )
def replaceCategory(cuisine):
    """Collapse a category value to a single cuisine name.

    The cuisine names are tested against *cuisine* with ``in`` in a fixed
    order and the first one found is returned. If none is found, *cuisine*
    is returned unchanged.
    """
    ordered_names = (
        'American',
        'Indian',
        'Chinese',
        'Japanese',
        'Middle Eastern',
        'Mexican',
    )
    for label in ordered_names:
        if label in cuisine:
            # No cuisine name is a substring of another, so the first hit
            # is final.
            return label
    return cuisine
# Load the business records: each line served at urls[0] is parsed as one
# JSON document.
url_bus = urls[0]
dataset = urlopen(url_bus)
try:
    data = [json.loads(line) for line in dataset]
finally:
    # Release the connection even if a line fails to parse.
    dataset.close()
df_bus = pd.DataFrame(data)

# Keep only businesses in the selected cities that serve a preferred cuisine.
# The explicit copy makes df_bus an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
df_bus = df_bus[
    df_bus.city.apply(inUS) & df_bus.categories.apply(isPrefferedCusine)
].copy()
# Collapse each category value to a single cuisine name.
df_bus['categories'] = df_bus['categories'].apply(replaceCategory)
# df_bus['cuisine'] = df_bus['categories'].map(lambda x: re.match('^(\d+)', x).groups()[0])
# Load the review records: each line served at urls[1] is parsed as one JSON
# document.
url_rev = urls[1]
dataset = urlopen(url_rev)
try:
    data = [json.loads(line) for line in dataset]
finally:
    # Release the connection even if a line fails to parse.
    dataset.close()
df_rev = pd.DataFrame(data)
# Scatter plot: star rating (y) against review count (x), one point per row
# of df_bus.
output_notebook()
p = figure(title="Ratings over Review Count")
p.xaxis.axis_label = "Number of Reviews"
p.yaxis.axis_label = "Rating"
p.scatter(
    df_bus['review_count'],
    df_bus['stars'],
    marker="circle",
    size=12,
    line_color="firebrick",
    fill_color="blue",
    fill_alpha=0.5,
)
show(p)
# Load the check-in records: each line served at urls[2] is parsed as one
# JSON document.
url_check = urls[2]
dataset = urlopen(url_check)
try:
    data = [json.loads(line) for line in dataset]
finally:
    # Release the connection even if a line fails to parse.
    dataset.close()
df_check = pd.DataFrame(data)
# Load the user records: each line served at urls[3] is parsed as one JSON
# document.
url_user = urls[3]
dataset = urlopen(url_user)
try:
    data = [json.loads(line) for line in dataset]
finally:
    # Release the connection even if a line fails to parse.
    dataset.close()
df_user = pd.DataFrame(data)
# Join reviews to the filtered businesses; the inner join keeps only reviews
# whose business_id is present in df_bus.
df_merge = pd.merge(df_rev, df_bus, on='business_id', how='inner')

# Per-review 'useful' vote count and star rating, taken from all of df_rev
# (not from the merged frame). Each `votes` entry is converted with dict()
# and its 'useful' key looked up; a missing key yields None.
useful_votes = list(df_rev.votes.apply(lambda val: dict(val).get('useful')))
stars = list(df_rev['stars'])
# Only the first `count` reviews are plotted.
count = 90000

output_notebook()
p = figure(title="Ratings over 'Useful' Vote Count")
p.scatter(useful_votes[:count], stars[:count], marker="circle",
          line_color="firebrick", fill_color="blue", fill_alpha=0.5, size=12)
p.xaxis.axis_label = "Number of 'Useful' Votes"
p.yaxis.axis_label = "Rating"
show(p)
# The year is the leading run of digits of the review's date string and is
# kept as a string. The pattern is a raw string so that `\d` is not treated
# as an (invalid) string escape sequence.
# NOTE(review): a date that does not start with a digit makes re.match return
# None and this raises AttributeError - confirm the date format upstream.
df_merge['year'] = df_merge['date'].map(lambda x: re.match(r'^(\d+)', x).group(1))
# Mean star rating per cuisine, grouped by city, over the filtered businesses.
output_notebook()
bar1 = Bar(df_bus, label='categories', values='stars', group='city', agg='mean',
           title="Avg Star Rating for each Cuisine", legend='top_left')
show(bar1)

# Reviews written in 2012-2014. 'year' holds strings, so compare against
# strings. Computed once and shared by the next two charts.
recent_reviews = df_merge[df_merge['year'].isin(['2012', '2013', '2014'])]

# Mean of 'stars_y' per cuisine, grouped by review year.
# NOTE(review): with pandas' default merge suffixes 'stars_y' is the star
# column from df_bus (the business rating), not the individual review's
# rating ('stars_x') - confirm which one this chart should average.
output_notebook()
bar = Bar(recent_reviews, label='categories', values='stars_y', group='year', agg='mean',
          title="Avg Star Rating for each Cuisine over years", legend='top_left', bar_width=0.4)
show(bar)

# Number of reviews (rows of the merged frame) per cuisine, grouped by year.
output_notebook()
bar = Bar(recent_reviews, label='categories', values='review_count', group='year', agg='count',
          title="Total Review Count for each Cuisine over years", legend='top_left', bar_width=0.4)
show(bar)

# Number of reviews per 'stars_y' value, grouped by cuisine.
output_notebook()
bar = Bar(df_merge, label='stars_y', values='review_count', group='categories', agg='count',
          title="Ratings of Cuisines", xlabel="Ratings", ylabel="No. of Reviews",
          legend='top_left', bar_width=0.4)
show(bar)

# Number of restaurants per cuisine, grouped by city. df_merge has one row
# per review, so counting its rows directly would count reviews; keep one
# row per business_id so each reviewed restaurant is counted exactly once.
output_notebook()
bar = Bar(df_merge.drop_duplicates('business_id'), label='categories', values='business_id',
          agg='count', group='city', legend='top_left',
          title="Total Number of Restaurants for each Cuisine in the Country",
          ylabel="Total Business", xlabel="Cuisines", bar_width=0.4)
show(bar)
# Join check-in records to the filtered businesses.
df_merge2 = pd.merge(df_check, df_bus, on='business_id', how='inner')

# Print the check-in entries of the first five merged rows. Iterating a
# DataFrame directly yields its column labels, not rows, so walk the
# 'checkin_info' column instead.
# Assumes each checkin_info value is a dict - TODO confirm against the data.
for checkin_info in df_merge2.head()['checkin_info']:
    for k, v in checkin_info.items():
        # %-formatting prints "k v" identically under Python 2 and Python 3.
        print('%s %s' % (k, v))
import matplotlib.image as mpimg
# Build one text blob from every review of a Japanese restaurant, with
# list-style numbering such as "1. " removed. Reviews are joined with a space
# so the last word of one review is not fused with the first word of the
# next.
text = ' '.join(
    re.sub(r"\d+\. ", "", review)
    for review in df_merge.text[df_merge['categories'] == 'Japanese']
)
# Generate the word cloud from `text` with a fixed random_state.
stopwords = STOPWORDS.copy()
wc = WordCloud(
    max_font_size=40,
    stopwords=stopwords,
    margin=10,
    random_state=1,
).generate(text)
# Image array captured before recolor() is called below.
default_colors = wc.to_array()

# First figure: the recoloured cloud, which is also what gets written to disk.
plt.title("Custom colors")
plt.imshow(wc.recolor())
plt.axis("off")
wc.to_file("Japanese.png")

# Second figure: the array captured before recolouring.
plt.figure()
plt.title("Default colors")
plt.imshow(default_colors)
plt.axis("off")
plt.show()

# Display the saved image (has a visible effect only in a notebook cell).
from IPython.display import Image
Image(filename='Japanese.png')